Intro

This document is a lab-book for visual data minining carried out on the CHUSJ dataset.

# Read dataset from SPSS file
BD_chusj <- haven::read_sav(file.path("data", "BD_chusj_MAIN.sav"))
BD_chusj <- BD_chusj %>% zap_formats %>% zap_label %>% zap_widths






Data Characterization

Age

hist(BD_chusj$Age, col = "azure3",
     main = "Histogram of Age",
     xlab = "Age" )

# Add boxplot
par(new = TRUE)
boxplot(BD_chusj$Age,
        horizontal = TRUE,
        axes = FALSE,
        boxwex = 0.15,
        col = rgb(0.81, 0.85, 0.9, alpha = 0.5))

Date Of First Positive Lab Result

FirstDate <- 
  BD_chusj %>% 
  select(DateOfFirstPositiveLabResult) %>% 
  group_by(date = DateOfFirstPositiveLabResult) %>% 
  summarise(freq = n())

FirstDate_daily   <- xts::as.xts(FirstDate$freq, order.by = as.Date(FirstDate$date))
FirstDate_weekly  <- xts::apply.weekly(FirstDate_daily, sum)
FirstDate_monthly <- xts::apply.monthly(FirstDate_daily, sum)

par(mfrow = c(3,1))
plot(FirstDate_daily, col = "cadetblue4", lwd = 3,
     main = "Date of first positive lab result - daily")
plot(FirstDate_weekly, col = "cadetblue4", lwd = 3,
     main = "Date of first positive lab result - weekly")
plot(FirstDate_monthly, col = "cadetblue4", lwd = 3,
     main = "Date of first positive lab result - monhly")

par(mfrow = c(1,1))

# Clear aux variables
rm(list = ls()[grep("^FirstDate", ls())])

Comorbidities

comorbidities <- BD_chusj[,c(7:18, 20:21, 23)]
comorbidities <- sapply(comorbidities, sum) %>% as.data.frame()
comorbidities <- data.frame(comorbidities = rownames(comorbidities),
                            count = comorbidities[,1])

ggplot(data = comorbidities, aes(x = reorder(comorbidities, count), y = count)) +
  geom_bar(stat = 'identity', color = "cadetblue4", fill = "azure3", width = 0.85) +
  geom_text(aes(label = count), hjust = -0.3, color = "slategray4", size = 3.5) +
  coord_flip() +
  ggtitle(label = "Comorbidities",
          subtitle = "Frequency in 2688 patients") +
  labs(x = "", y = "")

rm(comorbidities)






Radar Charts


Comorbidities

comorbidities <- BD_chusj[,c(7:18, 20:21, 23)]
comorbidities <- sapply(comorbidities, sum) %>% t() %>% as.data.frame()
comorbidities <- comorbidities / nrow(BD_chusj) # convert to percentage

# Find max and round it (ceiling) to the nearest .1 multiple
max.axis <- plyr::round_any(max(comorbidities), 0.1, f = ceiling)

# Prepare data for radar chart
comorbidities <- rbind(max = rep(max.axis, 15),
                       min = rep(0, 15),
                       value = comorbidities)

radarchart(comorbidities,
           axistype = 1,
           # custom polygon
           pcol = rgb(0.2, 0.5, 0.5, 0.9),  # line color
           pfcol = rgb(0.2, 0.5, 0.5, 0.2), # fill-in color
           plwd = 2,                        # line width
           # custom the grid
           cglcol = "grey20",   # net color
           cglty = 3,           # net line type
           axislabcol = "grey50", # net label color
           cglwd = 0.8,         # net width
           caxislabels = scales::percent(seq(0, max.axis, length.out = 5)),
           # custom labels
           calcex = 0.9, # font size of net labels
           vlcex = 0.8, # font size of outer labels
           title = "Overall frequency of comorbidities (%)")


Comorbidities vs ATC families

#' ATCs and Comorbidities
ATC_lv1 <- BD_chusj[, c(469, 378:390)]
comorbidities <- BD_chusj[,c(7:18, 20:21, 23)]
ATCs_comorbidities <- cbind(ATC_lv1, comorbidities)

# Create an empty dataset
ATC.Comorbs_matrix <- data.frame(matrix(NA,
                                        ncol = ncol(comorbidities),
                                        nrow = ncol(ATC_lv1)))
colnames(ATC.Comorbs_matrix) <- names(comorbidities)
rownames(ATC.Comorbs_matrix) <- names(ATC_lv1)

# Compute matrix
for (atc in rownames(ATC.Comorbs_matrix)) {
  for (comorb in colnames(ATC.Comorbs_matrix)) {
    ATC.Comorbs_matrix[atc, comorb] <- 
      ATCs_comorbidities[ATCs_comorbidities[,atc] == 1 & ATCs_comorbidities[,comorb] == 1,
                         c(atc, comorb)] %>% nrow()
  }
}
rm(atc, comorb, ATCs_comorbidities, comorbidities)


# Check Top-N ATC families
ATCs <- sapply(ATC_lv1, sum) %>% as.data.frame() %>% arrange(desc(.))

# Select Top-n ATC families
ATCs <- ATCs %>% head(4)

# Prepare data for radar chart
radar.data <- ATC.Comorbs_matrix[rownames(ATCs), ]

# Find max and round it (ceiling) to the nearest .1 multiple
max.axis <- plyr::round_any(max(radar.data), 50, f = ceiling)

radar.data <- rbind(max = rep(max.axis, ncol(radar.data)),
                    min = rep(0, ncol(radar.data)),
                    radar.data)

# Define colors
colors <- RColorBrewer::brewer.pal(nrow(radar.data) - 2, "Set1")
colors_border <- colors
colors_in <- scales::alpha(colors, alpha = 0.15)

radarchart(radar.data,
           axistype = 1,
           # custom polygon
           pcol = colors_border,  # line color
           pfcol = colors_in,     # fill-in color
           plwd = 2,              # line width
           plty = 1,              # line type
           # custom the grid
           cglcol = "grey20",   # net color
           cglty = 3,  # net line type
           axislabcol = "grey50", # net label color
           cglwd = 0.8,         # net width
           caxislabels = seq(0, max.axis, length.out = 5),
           # custom labels
           calcex = 0.8,
           vlcex = 0.7,
           title = "Frequency of top-4 ATC groups per Comorbidity (n)")

legend(x = 1.1,
       y = -0.7,
       legend = rownames(radar.data[-c(1,2),]),
       bty = "n",
       pch = 20,
       col = colors_border,
       text.col = "grey40",
       cex = 0.9,
       pt.cex = 2)


Comorbidities vs Sex

comorbidities <- BD_chusj[,c(7:18, 20:21, 23)]
comorbidities <- cbind(Sex = BD_chusj$Gender,
                       comorbidities)
comorbidities <- rbind(
  # total = comorbidities %>% sapply(., sum) %>% t() %>% as.data.frame(),
  male   = comorbidities %>% filter(Sex == 1) %>% sapply(., sum) %>% t() %>% as.data.frame(),
  female = comorbidities %>% filter(Sex == 0) %>% sapply(., sum) %>% t() %>% as.data.frame())
comorbidities$Sex <- NULL

# convert to percentage
comorbidities <- comorbidities / nrow(BD_chusj) 

# Find max and round it (ceiling) to the nearest .1 multiple
max.axis <- plyr::round_any(max(comorbidities), 0.1, f = ceiling)

# Prepare data for radar chart
comorbidities <- rbind(max = rep(max.axis, length(comorbidities)),
                       min = rep(0, length(comorbidities)),
                       comorbidities)

# Define colors
# colors <- RColorBrewer::brewer.pal(3, "BuPu")
colors <- c("dodgerblue3", "hotpink1")
colors_border <- colors
colors_in <- c(scales::alpha(colors[1], 0.15), scales::alpha(colors[2], 0.15))

radarchart(comorbidities,
           axistype = 1,
           # custom polygon
           pcol = colors_border,  # line color
           pfcol = colors_in,     # fill-in color
           plwd = 2,              # line width
           plty = 1,              # line type
           # custom the grid
           cglcol = "grey20",   # net color
           cglty = 3,  # net line type
           axislabcol = "grey50", # net label color
           cglwd = 0.8,         # net width
           caxislabels = scales::percent(seq(0, max.axis, length.out = 5)),
           # custom labels
           calcex = 0.8,
           vlcex = 0.7,
           title = "Frequency of Comorbidities per Sex (%)")

legend(x = 1.1,
       y = -1,
       legend = rownames(comorbidities[-c(1,2),]),
       bty = "n",
       pch = 20,
       col = colors_border,
       text.col = "grey40",
       cex = 0.9,
       pt.cex = 2)

{
  cat("Frequencies in percentage (%):\n")
  (comorbidities[-c(1:2),] * 100) %>% round(., digits = 1) %>% 
    t() %>% as.data.frame() %>% print()
}
## Frequencies in percentage (%):
##                   male female
## CANC              17.2   14.0
## CEREBROVASCULAR    6.0    4.6
## DIAB              16.9   12.4
## KIDNEY             9.3    6.3
## LIVER              5.1    2.1
## LUNG_I             7.1    1.5
## LUNG_II            5.8    3.9
## HEART             11.7    7.6
## TRANSP             1.2    0.5
## OBESITY            8.9   10.8
## SMOKING           15.6    1.6
## NERVOUS           14.4   12.8
## ASTHMA             1.7    2.5
## HYPERTENSION       1.7    2.5
## PreconditionOther  8.3    4.4


Comorbidities in Intensive Care

comorbidities <- BD_chusj[,c(7:18, 20:21, 23)]
comorbidities <- cbind(IntensiveCare = BD_chusj$IntensiveCare,
                       comorbidities)
comorbidities <- rbind(
  # total = comorbidities %>% sapply(., sum) %>% t() %>% as.data.frame(),
  IC_no = comorbidities %>% filter(IntensiveCare == 0) %>% sapply(., sum) %>% t() %>% as.data.frame(),
  IC_yes = comorbidities %>% filter(IntensiveCare == 1) %>% sapply(., sum) %>% t() %>% as.data.frame())
comorbidities$IntensiveCare <- NULL

# convert to percentage
comorbidities <- comorbidities / nrow(BD_chusj)
# comorbidities %>% t() %>% round(., digits = 3) %>% as.data.frame() %>% print()

# Find max and round it (ceiling) to the nearest .1 multiple
max.axis <- plyr::round_any(max(comorbidities), 0.1, f = ceiling)

# Prepare data for radar chart
comorbidities <- rbind(max = rep(max.axis, length(comorbidities)),
                       min = rep(0, length(comorbidities)),
                       comorbidities)

# Define colors
# colors <- RColorBrewer::brewer.pal(3, "BuPu")
colors <- c("seagreen4", "indianred3")
colors_border <- colors
colors_in <- c(scales::alpha(colors[1], 0.3), scales::alpha(colors[2], 0.6))

radarchart(comorbidities,
           axistype = 1,
           # custom polygon
           pcol = colors_border,  # line color
           pfcol = colors_in,     # fill-in color
           plwd = 2,              # line width
           plty = 1,              # line type
           # custom the grid
           cglcol = "grey20",   # net color
           cglty = 3,  # net line type
           axislabcol = "grey50", # net label color
           cglwd = 0.8,         # net width
           caxislabels = scales::percent(seq(0, max.axis, length.out = 5)),
           # custom labels
           calcex = 0.8,
           vlcex = 0.7,
           title = "Frequency of Comorbidities in Intensive Care(%)")

legend(x = 1.1,
       y = -1,
       legend = rownames(comorbidities[-c(1,2),]),
       bty = "n",
       pch = 20,
       col = colors_border,
       text.col = "grey40",
       cex = 0.9,
       pt.cex = 2)

{
  cat("Frequencies in percentage (%):\n")
  (comorbidities[-c(1:2),] * 100) %>% round(., digits = 1) %>% 
    t() %>% as.data.frame() %>% print()
}
## Frequencies in percentage (%):
##                   IC_no IC_yes
## CANC               21.3    9.9
## CEREBROVASCULAR     8.1    2.5
## DIAB               19.6    9.7
## KIDNEY             11.2    4.4
## LIVER               4.1    3.0
## LUNG_I              5.7    2.9
## LUNG_II             3.5    6.2
## HEART              13.9    5.4
## TRANSP              0.9    0.7
## OBESITY            10.4    9.3
## SMOKING             9.9    7.3
## NERVOUS            20.5    6.7
## ASTHMA              2.9    1.3
## HYPERTENSION        2.9    1.3
## PreconditionOther   8.9    3.8


Comorbidities vs Outcome

comorbidities <- BD_chusj[,c(7:18, 20:21, 23)]
comorbidities <- cbind(Outcome = BD_chusj$Outcome,
                       comorbidities)

comorbidities <- rbind(
  # total = comorbidities %>% sapply(., sum) %>% t() %>% as.data.frame(),
  Recovered     = comorbidities %>% filter(Outcome == 0) %>% sapply(., sum) %>% t() %>% as.data.frame(),
  Died          = comorbidities %>% filter(Outcome == 1) %>% sapply(., sum) %>% t() %>% as.data.frame(),
  OtherHospital = comorbidities %>% filter(Outcome == 2) %>% sapply(., sum) %>% t() %>% as.data.frame())
comorbidities$Outcome <- NULL

# convert to percentage
comorbidities <- comorbidities / nrow(BD_chusj) 

# Find max and round it (ceiling) to the nearest .1 multiple
max.axis <- plyr::round_any(max(comorbidities), 0.1, f = ceiling)

# Prepare data for radar chart
comorbidities <- rbind(max = rep(max.axis, length(comorbidities)),
                       min = rep(0, length(comorbidities)),
                       comorbidities)

# Define colors
# colors <- RColorBrewer::brewer.pal(3, "BuPu")
colors <- c("forestgreen", "firebrick3", "dodgerblue3")
colors_border <- colors
colors_in <- c(scales::alpha(colors[1], 0.15), scales::alpha(colors[2], 0.15))

radarchart(comorbidities,
           axistype = 1,
           # custom polygon
           pcol = colors_border,  # line color
           pfcol = colors_in,     # fill-in color
           plwd = 2,              # line width
           plty = 1,              # line type
           # custom the grid
           cglcol = "grey20",   # net color
           cglty = 3,  # net line type
           axislabcol = "grey50", # net label color
           cglwd = 0.8,         # net width
           caxislabels = scales::percent(seq(0, max.axis, length.out = 5)),
           # custom labels
           calcex = 0.8,
           vlcex = 0.7,
           title = "Frequency of Comorbidities per Outcome (%)")

legend(x = 1.1,
       y = -0.8,
       legend = rownames(comorbidities[-c(1,2),]),
       bty = "n",
       pch = 20,
       col = colors_border,
       text.col = "grey40",
       cex = 0.9,
       pt.cex = 2)

{
  cat("Frequencies in percentage (%):\n")
  (comorbidities[-c(1:2),] * 100) %>% round(., digits = 1) %>% 
    t() %>% as.data.frame() %>% print()
}
## Frequencies in percentage (%):
##                   Recovered Died OtherHospital
## CANC                   19.6  8.5           3.0
## CEREBROVASCULAR         6.0  3.2           1.4
## DIAB                   19.6  7.0           2.6
## KIDNEY                  9.2  4.8           1.6
## LIVER                   4.8  1.7           0.6
## LUNG_I                  5.0  2.6           1.0
## LUNG_II                 4.8  3.0           2.0
## HEART                  11.0  6.0           2.2
## TRANSP                  1.3  0.3           0.0
## OBESITY                12.9  3.6           3.1
## SMOKING                10.6  4.2           2.3
## NERVOUS                15.8  8.1           3.2
## ASTHMA                  3.4  0.4           0.4
## HYPERTENSION            3.4  0.4           0.4
## PreconditionOther       8.8  2.2           1.5






Heatmaps

ATC families, 1st level

#' ATC codes (first level) are listed in variables "469 + 378-390"
ATC_lv1 <- BD_chusj[, c(469, 378:390)]

# Create an empty dataset
heatmap.matrix <- data.frame(matrix(NA, ncol = length(ATC_lv1), nrow = length(ATC_lv1)))
colnames(heatmap.matrix) <- rownames(heatmap.matrix) <- names(ATC_lv1)

# Fill in dataset with count of co-occurences
for (i in 1:length(ATC_lv1)) {
  for (j in i:length(ATC_lv1)) {
    heatmap.matrix[i,j] <- heatmap.matrix[j,i] <- nrow(ATC_lv1[,c(i,j)][ATC_lv1[,i]==1 & ATC_lv1[,j]==1,])
  }
}; rm(i, j)

# Reshape data matrix to unrolled data, to feed heatmap
heatmap.data <- reshape2::melt(as.matrix(heatmap.matrix))
colnames(heatmap.data) <- c("x", "y", "value") 

# Change co-occurance counts to to percentages
heatmap.data$value <- round(heatmap.data$value / nrow(ATC_lv1), digits = 3)

# Plot graphic
ggplot(heatmap.data, aes(x, y, fill = value)) + 
  geom_tile(color = "grey95",
            lwd = 1,
            linetype = 1) + 
  theme(axis.text.x = element_text(angle = 90),
        axis.title.x = element_blank(),
        axis.title.y = element_blank()) +
  ggtitle(label = "Heatmap",
          subtitle = "Co-frequencies in 1st-level ATC drug groups") +
  geom_text(aes(label = value), color = "gray30", size = 3) +
  scale_fill_gradient(low = "lightsteelblue1", high = "darkolivegreen3") +
  guides(fill = guide_colourbar(barwidth = 0.5,
                                barheight = 20,
                                title = "%"))

  • ATC.B is the single most prevalent ATC drug family, being used by 89.1% of patients.

  • ATC.A + ATC.B is the most prevalent drug family dual association, being jointly used by 66.6% of patients.


Find largest values of co-frequencies

groups <- colnames(heatmap.matrix)
group.combinations <- combn(groups, 2) %>% as.data.frame()

cofrequencies <- 
  sapply(1:ncol(group.combinations),
         FUN = function(n){
           heatmap.data %>%
             filter(x == group.combinations[1,n],
                    y == group.combinations[2,n]) %>%
             select(value)
         })

group.combinations <- 
  cbind(t(group.combinations), cofrequencies) %>% 
  as.data.frame() %>%
  lapply(., as.character) %>%
  as.data.frame()

group.combinations$cofrequencies <- as.numeric(group.combinations$cofrequencies)
group.combinations %>% arrange(desc(cofrequencies)) %>% head(10) %>% print
##       V1    V2 cofrequencies
## 1  ATC.A ATC.B         0.666
## 2  ATC.B ATC.N         0.556
## 3  ATC.B ATC.J         0.524
## 4  ATC.A ATC.N         0.507
## 5  ATC.B ATC.R         0.495
## 6  ATC.A ATC.J         0.464
## 7  ATC.J ATC.N         0.440
## 8  ATC.B ATC.C         0.400
## 9  ATC.A ATC.R         0.396
## 10 ATC.A ATC.C         0.360

The most frequent dual combinations are the following, all of which occurring in more than 50% of patients:

  • ATC.A + ATC.B : 66,6 %

  • ATC.B + ATC.N : 55,6 %

  • ATC.B + ATC.J : 52,4 %

  • ATC.A + ATC.N : 50,7 %


ATC groups 2nd level

heatmap_2groups <- function(index1 = NA,
                            index2 = NA) {
  
  #' This function plots a heatmap of 2 ATC groups at 2nd level
  #' inputs: the indexes on the 2nd levels codes for each group
  #'   ATC.A: 362:377 (16 codes)
  #'   ATC.B: 391:395 (5 codes)
  #'   ATC.J: 425:430 (6 codes)
  #'   ATC.N: 441:447 (7 codes)
  
  ATC_group1 <- BD_chusj[, index1]
  ATC_group2 <- BD_chusj[, index2]
  
  # Create an empty dataset
  heatmap.matrix <- data.frame(matrix(NA,
                                      ncol = ncol(ATC_group1),
                                      nrow = ncol(ATC_group2)))
  colnames(heatmap.matrix) <- names(ATC_group1)
  rownames(heatmap.matrix) <- names(ATC_group2)
  
  ATC_groups12 <- cbind(ATC_group1, ATC_group2)
  for (g1 in seq_along(ATC_group1)) {
    for (g2 in seq_along(ATC_group2)) {
      heatmap.matrix[g2, g1] <- 
        nrow(ATC_groups12[, c(g1, g2 + ncol(ATC_group1))][ATC_groups12[,g1]==1 &
                                                                    ATC_groups12[,g2+ncol(ATC_group1)]==1,])
    }
  }
  
  heatmap.data <- reshape2::melt(as.matrix(heatmap.matrix))
  colnames(heatmap.data) <- c("x", "y", "value")
  
  # change values to percentage
  # heatmap.data$value <- round(heatmap.data$value / nrow(ATC_lv1), digits = 3)
  
  family.group1 <- substring(names(ATC_group1)[1], 5, 5)
  family.group2 <- substring(names(ATC_group2)[1], 5, 5)
  
  p <- 
    ggplot(heatmap.data, aes(x, y, fill = value)) + 
    geom_tile(color = "grey95",
              lwd = 1,
              linetype = 1) +
    theme(axis.text.x = element_text(angle = 0),
          axis.title.x = element_blank(),
          axis.title.y = element_blank()) +
    ggtitle(label = "Heatmap",
            subtitle = paste("Co-frequencies in 2nd-level ATC drug groups",
                             family.group1, "and", family.group2)) +
    geom_text(aes(label = value), color = "gray30", size = 3) +
    scale_fill_gradient(low = "lightsteelblue1", high = "darkolivegreen3") +
    # coord_fixed() +
    guides(fill = guide_colourbar(barwidth = 0.5,
                                  barheight = 20,
                                  title = "freq"))

  # Find most prevalent combinations
  heatmap.data %>%
    arrange(desc(value)) %>%
    mutate(perc = round(value / nrow(BD_chusj), 3)) %>%
    relocate(y) %>%
    rename(!!family.group1 := y, !!family.group2 := x, freq = value) %>%
    head() %>% print()
  
  return(p)
}


Groups A + B

heatmap_2groups(index1 = 362:377, index2 = 391:395) # A.B
##         A       B freq  perc
## 1 ATC.A02 ATC.B01  894 0.333
## 2 ATC.A02 ATC.B05  850 0.316
## 3 ATC.A03 ATC.B01  832 0.310
## 4 ATC.A10 ATC.B01  811 0.302
## 5 ATC.A03 ATC.B05  770 0.286
## 6 ATC.A10 ATC.B05  685 0.255


Groups B + N

heatmap_2groups(index1 = 391:395, index2 = 441:447) # B.N
##         B       N freq  perc
## 1 ATC.B01 ATC.N02 1233 0.459
## 2 ATC.B05 ATC.N02 1116 0.415
## 3 ATC.B01 ATC.N05  664 0.247
## 4 ATC.B05 ATC.N05  642 0.239
## 5 ATC.B01 ATC.N01  480 0.179
## 6 ATC.B05 ATC.N01  447 0.166


Groups B + J

heatmap_2groups(index1 = 391:395, index2 = 425:430) # B.J
##         B       J freq  perc
## 1 ATC.B01 ATC.J01 1293 0.481
## 2 ATC.B05 ATC.J01 1110 0.413
## 3 ATC.B03 ATC.J01  203 0.076
## 4 ATC.B02 ATC.J01   93 0.035
## 5 ATC.B01 ATC.J05   70 0.026
## 6 ATC.B05 ATC.J02   68 0.025


Groups A + N

heatmap_2groups(index1 = 362:377, index2 = 441:447) # A.N
##         A       N freq  perc
## 1 ATC.A03 ATC.N02  830 0.309
## 2 ATC.A02 ATC.N02  810 0.301
## 3 ATC.A10 ATC.N02  583 0.217
## 4 ATC.A02 ATC.N05  494 0.184
## 5 ATC.A03 ATC.N05  484 0.180
## 6 ATC.A02 ATC.N01  423 0.157